# Notebook cosmetics: widen the Jupyter cell container to use most of the screen.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))
# -> From ROOT TTree to h5py file
import pandas as pd
import numpy as np
import os
import h5py
import pickle
import matplotlib
import matplotlib.pyplot as plt
import uproot
### Input/output locations and sample selection.
# data_dir holds the skimmed ROOT ntuples; work_dir is the analysis output area.
data_dir = '/data_CMS/cms/portales/HHresonant_SKIMS/trimmed_samples/'
work_dir = '/data_CMS/cms/portales/HHresonant_DANN/'
mass_point = 'All' # Resonance mass in GeV ('All' presumably selects a merged-mass file - see filename pattern below)
sig_in = data_dir + 'Radion' + mass_point + '_allperiods_resDNN_selfix.root'
#sig_in = data_dir + 'Radion' + mass_point + '_2018_allperiods_resDNN_selfix.root'
bkg_in = data_dir + 'TT_all_sel.root'
#bkg_in = data_dir + 'DY_all_sel.root'
# Open the signal and background TTrees (same tree name in both files).
sigtree = uproot.open(sig_in)['HTauTauTree']
bkgtree = uproot.open(bkg_in)['HTauTauTree']
def get_planing_weight(mass_dist, n_edges=1000):
    """Compute per-event 'planing' weights that flatten a mass distribution.

    The density of ``mass_dist`` is estimated with a fine histogram, linearly
    interpolated at each event's mass value, and inverted, so that re-weighted
    events populate the mass spectrum uniformly.  The weights are rescaled to
    a (NaN-aware) mean of 1.

    Parameters
    ----------
    mass_dist : array-like of float
        One mass value per event (e.g. the reconstructed HH mass).
    n_edges : int, optional
        Number of histogram edges used for the density estimate, giving
        ``n_edges - 1`` bins.  Default 1000 reproduces the original tuning.

    Returns
    -------
    numpy.ndarray
        One planing weight per event, same length as ``mass_dist``.
    """
    from scipy import interpolate

    mass_dist = np.asarray(mass_dist, dtype=float)
    # Pad the histogram range by 10% on each side so every event lies strictly
    # inside the interpolation domain spanned by the bin centers.
    min_mass, max_mass = np.min(mass_dist), np.max(mass_dist)
    diff = max_mass - min_mass
    min_mass, max_mass = min_mass - 0.10 * diff, max_mass + 0.10 * diff
    # Bin edges and bin centers for the density estimate.
    edges = np.linspace(min_mass, max_mass, n_edges)
    centers = 0.5 * (edges[1:] + edges[:-1])
    # Fraction of entries per bin (normalised density).
    vals, _ = np.histogram(mass_dist, bins=edges, density=True)
    # Linear interpolation of the density at each event's mass; with
    # bounds_error=False, any out-of-range query yields NaN, which the
    # NaN-aware reductions below simply ignore.
    interp = interpolate.interp1d(centers, vals, bounds_error=False)
    probs = interp(mass_dist)
    # Inverse-density weight (up to a constant), then rescale to unit mean.
    weights = np.nansum(probs) / probs
    weights /= np.nanmean(weights)
    return weights
### Scalar input features read from the TTrees.  Order matters downstream
### (it fixes the column order of the training arrays); commented entries
### record variables that were tried and dropped.
inFeatures = [
 'HHKin_mass_raw',
 'mT1',
 'mT2',
# 'dau1_pt',
 'dau1_decayMode',
 'dau2_decayMode',
# 'bjet1_pt',
 'bjet1_CvsL',
 'bjet1_CvsB',
 'tauH_SVFIT_mass',
 'tauHsvfitMet_deltaPhi',
# 'ditau_deltaR',
 'bH_mass',
 'HHKin_chi2',
 'HHsvfit_deltaPhi',
 'ditau_deltaR_per_tauH_MET_pt',
#'DNNoutSM_kl_1'
]
### Four-vector components (pt/eta/phi/E) of the final-state objects, plus
### the MET x/y components; used both as inputs and to build derived angular
### features further below.
fourVect = [
 'METx',
 'METy',
 'dau1_pt',
 'dau1_eta',
 'dau1_phi',
 'dau1_e',
 'dau2_pt',
 'dau2_eta',
 'dau2_phi',
 'dau2_e',
 'bjet1_pt',
 'bjet1_eta',
 'bjet1_phi',
 'bjet1_e',
 'bjet2_pt',
 'bjet2_eta',
 'bjet2_phi',
 'bjet2_e',
 'VBFjet1_pt',
 'VBFjet1_eta',
 'VBFjet1_phi',
 'VBFjet1_e',
 'VBFjet2_pt',
 'VBFjet2_eta',
 'VBFjet2_phi',
 'VBFjet2_e',
]
#TODO: remove unwanted weights from list, check weights applied only to specific samples/regions
### Per-event weight branches available in the ntuples.  Not read anywhere in
### this script yet (see TODO above).
# NOTE(review): casing is inconsistent ('bTagweightL'/'bTagweightM' vs
# 'bTagWeightT', 'trigSf_cross', 'IdAndsoSF_deep_pt') - presumably these match
# the actual branch names in the ntuples; verify before using.
eventWeights = [
 'MC_weight',
 'totalWeight',
 'prescaleWeight',
 'L1pref_weight',
 'PUjetID_SF',
 'PUReweight',
 'bTagweightL',
 'bTagweightM',
 'bTagWeightT',
 'TTtopPtreweight',
 'trigSF',
 'trigSF_single',
 'trigSf_cross',
 'VBFtrigSF',
 'FakeRateSF',
 'jetFakeSF',
 'IdAndIsoSF_MVA',
 'IdAndIsoSF_deep',
 'IdAndsoSF_deep_pt',
 'IdAndIsoAndFakeSF_MVA',
 'IdAndIsoSFAndFakeSF_deep',
 'IdAndsoSFAndFakeSF_deep_pt',
 'FakeRateSF_MVA',
 'FakeRateSF_deep',
 'customTauIdSF',
 'DYscale_LL',
 'DYscale_MM',
 'DYscale_MH',
 'DYscale_MTT',
]
### Selection applied while reading the trees: keep events with a valid
### reconstructed HH mass below 1.5 TeV.
cuts = '(HHKin_mass_raw>0) & (HHKin_mass_raw<1500.)'
### turn TTree into pandas dataframes
sigEvents = sigtree.arrays(inFeatures+fourVect, cuts, library='pd')
bkgEvents = bkgtree.arrays(inFeatures+fourVect, cuts, library='pd')
### Add class labels (signal = 1, background = 0)
sigEvents.insert(0,'Class', 1, True)
bkgEvents.insert(0,'Class', 0, True)
### Add "Mass bin" classes (TODO: add binning optimiser here too !)
# Hand-tuned edges in HHKin_mass_raw; labels are the bin indices 0..19.
# Edges get denser at low mass - presumably chosen for roughly equal
# population; confirm against the binning study this came from.
HHmass_sig = pd.cut(sigEvents.HHKin_mass_raw, bins=[-1, 258, 265, 271, 277, 283, 289, 295, 302, 309, 317, 325, 335, 346, 358, 374, 394, 420, 459, 531, 999999], labels=range(20))
HHmass_bkg = pd.cut(bkgEvents.HHKin_mass_raw, bins=[-1, 258, 265, 271, 277, 283, 289, 295, 302, 309, 317, 325, 335, 346, 358, 374, 394, 420, 459, 531, 999999], labels=range(20))
sigEvents.insert(0,'massBin', HHmass_sig, True)
bkgEvents.insert(0,'massBin', HHmass_bkg, True)
### Get labels in separated arrays
sigEvents_labels = np.ravel(sigEvents.Class)
bkgEvents_labels = np.ravel(bkgEvents.Class)
sigEvents_massBin = np.ravel(HHmass_sig)
bkgEvents_massBin = np.ravel(HHmass_bkg)
### Get Planing weights (computed per sample, so each class is flattened
### against its own mass spectrum)
sigEvents_planWeights = get_planing_weight(np.array(sigEvents.HHKin_mass_raw))
bkgEvents_planWeights = get_planing_weight(np.array(bkgEvents.HHKin_mass_raw))
sigEvents.insert(0,'w_planing', sigEvents_planWeights)
bkgEvents.insert(0,'w_planing', bkgEvents_planWeights)
### Stack signal and background into a single dataframe
Events = pd.concat([sigEvents,bkgEvents])
from lumin.data_processing.hep_proc import delta_r
### Derived pairwise angular separations (tau pair, b-jet pair, VBF-jet pair).
addFeatures = ['dphi_tautau', 'dphi_bb', 'dphi_VBF',
               'deta_tautau', 'deta_bb', 'deta_VBF',
               'dr_tautau', 'dr_bb', 'dr_VBF']

def _delta_phi(phi_a, phi_b):
    """Absolute azimuthal separation wrapped into [0, pi].

    A raw |phi_a - phi_b| over-estimates the separation whenever the two
    angles straddle the +/-pi boundary (giving values up to 2*pi); wrapping
    the difference fixes that, which also corrects the derived delta-R below.
    """
    return np.abs(np.mod(phi_a - phi_b + np.pi, 2.0 * np.pi) - np.pi)

Events[ "dphi_tautau" ] = _delta_phi( Events.dau1_phi, Events.dau2_phi )
Events[ "dphi_bb" ] = _delta_phi( Events.bjet1_phi, Events.bjet2_phi )
Events[ "dphi_VBF" ] = _delta_phi( Events.VBFjet1_phi, Events.VBFjet2_phi )
Events[ "deta_tautau" ] = np.abs( Events.dau1_eta - Events.dau2_eta )
Events[ "deta_bb" ] = np.abs( Events.bjet1_eta - Events.bjet2_eta )
Events[ "deta_VBF" ] = np.abs( Events.VBFjet1_eta - Events.VBFjet2_eta )
# delta_r combines the (already non-negative) dphi/deta components.
Events[ "dr_tautau" ] = np.abs(delta_r( Events.dphi_tautau, Events.deta_tautau ))
Events[ "dr_bb" ] = np.abs(delta_r( Events.dphi_bb, Events.deta_bb ))
Events[ "dr_VBF" ] = np.abs(delta_r( Events.dphi_VBF, Events.deta_VBF ))
### Column roles for training: targets, per-event weight, and feature lists.
targ_class = 'Class'      # binary signal/background target
targ_mass = 'massBin'     # coarse mass-bin target
weight_feat = 'w_planing' # per-event planing weight

# Categorical inputs; every other feature is treated as continuous.
cat_feats = ['dau1_decayMode','dau2_decayMode']

def _order_preserving_exclude(candidates, taken):
    """Return `candidates` in order, skipping any name already in `taken`."""
    seen = set(taken)
    return [name for name in candidates if name not in seen]

# Build the continuous-feature list without duplicating anything already kept.
base_cont_feats = _order_preserving_exclude(inFeatures, cat_feats)
fourVec_cont_feats = _order_preserving_exclude(fourVect, cat_feats + base_cont_feats)
add_cont_feats = _order_preserving_exclude(addFeatures, cat_feats + base_cont_feats + fourVec_cont_feats)
cont_feats = base_cont_feats + add_cont_feats + fourVec_cont_feats
train_feats = cat_feats + cont_feats
print(cat_feats)
print(cont_feats)
from lumin.plotting.data_viewing import plot_feat
# Compare the signal vs background marginal distribution of every training
# feature, applying the planing weights.
for feature in train_feats:
#for feature in cont_feats:
#for feature in cat_feats:
    plot_feat(Events, feature, cuts=[(Events.Class==0),(Events.Class==1)], labels=['Bkg.','Sig'], wgt_name='w_planing')
plt.figure(figsize=(32, 24))
import seaborn as sns
# Linear-correlation heatmap over all training features (signal and
# background mixed; variable name notwithstanding).
corr_sig = Events[train_feats].corr()
sns.heatmap(corr_sig, annot=True, xticklabels=train_feats, yticklabels=train_feats)
plt.show()
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the class label, fixed seed for reproducibility.
Events_train, Events_test = train_test_split(Events, test_size=0.2, stratify=Events.Class, random_state=42)
from lumin.utils.data import check_val_set
# Sanity check that train and test feature distributions are compatible.
check_val_set(Events_train[train_feats], Events_test[train_feats])
from lumin.data_processing.pre_proc import fit_input_pipe
# Fit the input-scaling pipeline on the training set only, then apply it to
# both sets (pipe is also pickled to the given path for later inference).
# NOTE(review): assigning into a train_test_split slice can trigger pandas'
# SettingWithCopyWarning; behaviour appears intended here, but verify.
input_pipe = fit_input_pipe(Events_train, cont_feats, '/data_CMS/cms/portales/HHresonant_DANNlumin/input_pipe')
Events_train[cont_feats] = input_pipe.transform(Events_train[cont_feats])
Events_test[cont_feats] = input_pipe.transform(Events_test[cont_feats])
# Re-plot the (now standardised) training features as a cross-check.
for feature in train_feats:
#for feature in cont_feats:
#for feature in cat_feats:
    plot_feat(Events_train, feature, cuts=[(Events_train.Class==0),(Events_train.Class==1)], labels=['Bkg.','Sig'], wgt_name='w_planing')
from lumin.data_processing.pre_proc import proc_cats
# Encode categorical features as contiguous integer codes, with maps shared
# between train and test.
cat_maps, cat_szs = proc_cats(Events_train, cat_feats, Events_test)
cat_maps, cat_szs
from lumin.data_processing.file_proc import df2foldfile
# Persist 10-fold, class-stratified foldfiles for LUMIN training and testing.
# NOTE(review): misc_feats='' - the API presumably expects a list (or None);
# an empty string is falsy so this likely works, but confirm.
df2foldfile(df=Events_train, n_folds=10,
            cont_feats=cont_feats, cat_feats=cat_feats, cat_maps=cat_maps, targ_feats=[targ_class,targ_mass],
            wgt_feat=weight_feat, misc_feats='',
            savename='/data_CMS/cms/portales/HHresonant_DANNlumin/train', targ_type='float32', strat_key='Class')
df2foldfile(df=Events_test, n_folds=10,
            cont_feats=cont_feats, cat_feats=cat_feats, cat_maps=cat_maps, targ_feats=[targ_class,targ_mass],
            wgt_feat=weight_feat, misc_feats='',
            savename='/data_CMS/cms/portales/HHresonant_DANNlumin/test', targ_type='float32', strat_key='Class')
# this takes a little while
from lumin.optimisation.features import rf_rank_features
# Candidate features - presumably shortlisted from an earlier ranking round;
# note it mixes features commented out of inFeatures (dau1_pt, bjet1_pt).
trainFeats_tested = ['HHKin_chi2', 'tauH_SVFIT_mass', 'dau2_pt', 'HHKin_mass_raw', 'dr_tautau', 'tauHsvfitMet_deltaPhi', 'ditau_deltaR_per_tauH_MET_pt', 'bH_mass', 'dau1_pt', 'mT1', 'dau1_decayMode', 'bjet1_pt', 'bjet1_e', 'dr_bb', 'dau1_e']
# Random-forest importance ranking: keep features whose importance exceeds
# 0.001, averaged over 5 forests, using the planing weights.
import_feats = rf_rank_features(Events_train, Events_test, objective='classification', train_feats=trainFeats_tested, targ_name='Class',
                                importance_cut=0.001, n_rfs=5, wgt_name='w_planing')